Objectives

Multiple Linear Regression

# Read in the SLO homes data, standardize column names to snake_case,
# then keep only listings from the three cities we want to compare.
homes <- read_csv("slo_homes.csv") %>% 
  janitor::clean_names() %>% 
  filter(city %in% c("Arroyo Grande",
                     "Atascadero",
                     "San Luis Obispo"))
## Parsed with column specification:
## cols(
##   City = col_character(),
##   Price = col_double(),
##   Bedrooms = col_double(),
##   Bathrooms = col_double(),
##   SqFt = col_double(),
##   PricePerSqFt = col_double(),
##   Status = col_character()
## )

Look at correlations between the numeric variables to check whether collinearity might be a concern. The matrix below still includes price_per_sq_ft, but ignore it when interpreting, since it’s a metric calculated from price and sq_ft.

# Creates correlation matrix:
# Columns 2-6 of `homes` are the numeric variables (price, bedrooms,
# bathrooms, sq_ft, price_per_sq_ft). Note that this positional selection
# also pulls in price_per_sq_ft; it stays in the matrix but should be
# ignored when interpreting, since it is derived from price and sq_ft.
# NOTE(review): positional indexing breaks silently if column order
# changes -- consider selecting the columns by name instead.
homes_cor <- cor(homes[2:6])
homes_cor
##                     price    bedrooms bathrooms     sq_ft price_per_sq_ft
## price           1.0000000  0.31152578 0.4991453 0.6618575      0.73664029
## bedrooms        0.3115258  1.00000000 0.7519186 0.6960233     -0.09994633
## bathrooms       0.4991453  0.75191865 1.0000000 0.8046112      0.07624770
## sq_ft           0.6618575  0.69602326 0.8046112 1.0000000      0.12549708
## price_per_sq_ft 0.7366403 -0.09994633 0.0762477 0.1254971      1.00000000

Make a correlogram (plot of correlations):

# Visualize the correlation matrix as a correlogram.
corrplot(homes_cor)

# Play notification sound #2 when the chunk finishes
# (presumably the beepr package -- confirm it is loaded above).
beep(2)

Start with a complete model including all variables for now:

# Additive multiple linear regression with price as the outcome and all
# candidate predictors included (no interaction terms).
home_lm <- lm(
  formula = price ~ city + bedrooms + bathrooms + sq_ft + status,
  data = homes
)

# Show the fitted coefficients.
print(home_lm)
## 
## Call:
## lm(formula = price ~ city + bedrooms + bathrooms + sq_ft + status, 
##     data = homes)
## 
## Coefficients:
##         (Intercept)       cityAtascadero  citySan Luis Obispo  
##            184130.9            -167396.4              31018.1  
##            bedrooms            bathrooms                sq_ft  
##           -161645.5              48692.0                389.1  
##       statusRegular     statusShort Sale  
##            303964.6             -19828.6

Let’s interpret the model results.

(Tip: A good guideline is at least 20 samples on each variable level you include in the regression.)

# Simplified model: drop bedrooms and bathrooms, which are strongly
# correlated with sq_ft in the matrix above (0.70 and 0.80).
home_lm2 <- lm(
  formula = price ~ city + sq_ft + status,
  data = homes
)

# Show the fitted coefficients.
print(home_lm2)
## 
## Call:
## lm(formula = price ~ city + sq_ft + status, data = homes)
## 
## Coefficients:
##         (Intercept)       cityAtascadero  citySan Luis Obispo  
##             -105821              -182587                70279  
##               sq_ft        statusRegular     statusShort Sale  
##                 325               315441                31284

On average, home price increases by $325 for every 1 sq ft increase in home size. (This is still pretty pricey, but makes sense for the central coast.)

AIC values: AIC measures the balance between model fit and model complexity. A lower AIC indicates a better fit-versus-parsimony trade-off (not necessarily a better raw fit).

# Compare the two candidate models by AIC (lower = better trade-off
# between fit and complexity).
AIC(home_lm) # values not meaningful on their own, but useful in comparison...
## [1] 3576.834
AIC(home_lm2)
## [1] 3584.3
# Be careful: AIC is not useful on its own! Consider your conceptual evidence from your interpretation of the data.
# Here the full model has the lower AIC (3576.8 vs 3584.3), but the
# simpler model's coefficients are easier to defend conceptually.

Let’s check out the adjusted R-squared value to see model fit.

summary(home_lm) # don't throw out coefficients based on significance alone!
## 
## Call:
## lm(formula = price ~ city + bedrooms + bathrooms + sq_ft + status, 
##     data = homes)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -843938 -115963  -12298  109718 3445077 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          184130.93  143368.84   1.284  0.20157    
## cityAtascadero      -167396.39   78701.95  -2.127  0.03552 *  
## citySan Luis Obispo   31018.14  114895.10   0.270  0.78766    
## bedrooms            -161645.51   49414.32  -3.271  0.00141 ** 
## bathrooms             48692.04   52408.63   0.929  0.35476    
## sq_ft                   389.12      53.17   7.318 3.43e-11 ***
## statusRegular        303964.64  105942.81   2.869  0.00488 ** 
## statusShort Sale     -19828.56   86335.35  -0.230  0.81875    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 380600 on 117 degrees of freedom
## Multiple R-squared:  0.5682, Adjusted R-squared:  0.5424 
## F-statistic:    22 on 7 and 117 DF,  p-value: < 2.2e-16
summary(home_lm2) # simpler model, with coefficients that make sense, but slightly smaller adjusted R-squared
## 
## Call:
## lm(formula = price ~ city + sq_ft + status, data = homes)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -653118 -120914   -8844  101402 3644738 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -105820.71  118697.40  -0.892   0.3745    
## cityAtascadero      -182586.90   80683.44  -2.263   0.0254 *  
## citySan Luis Obispo   70278.97  115846.31   0.607   0.5452    
## sq_ft                   325.03      31.54  10.306   <2e-16 ***
## statusRegular        315441.26  109749.65   2.874   0.0048 ** 
## statusShort Sale      31284.44   87356.11   0.358   0.7209    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 395100 on 119 degrees of freedom
## Multiple R-squared:  0.5268, Adjusted R-squared:  0.5069 
## F-statistic: 26.49 on 5 and 119 DF,  p-value: < 2.2e-16

Let’s ask stargazer package to create the summary regression table for us! Such a nightmare by hand.

# results 'asis' because stargazer outputs to HTML already
# (i.e., the chunk option results = 'asis' lets the raw HTML render).

# Side-by-side regression table for both fitted models, as HTML.
stargazer(home_lm, home_lm2, type = "html")
Dependent variable:
price
(1) (2)
cityAtascadero -167,396.400** -182,586.900**
(78,701.950) (80,683.440)
citySan Luis Obispo 31,018.140 70,278.970
(114,895.100) (115,846.300)
bedrooms -161,645.500***
(49,414.320)
bathrooms 48,692.040
(52,408.630)
sq_ft 389.120*** 325.028***
(53.171) (31.538)
statusRegular 303,964.600*** 315,441.300***
(105,942.800) (109,749.700)
statusShort Sale -19,828.560 31,284.440
(86,335.350) (87,356.110)
Constant 184,130.900 -105,820.700
(143,368.800) (118,697.400)
Observations 125 125
R2 0.568 0.527
Adjusted R2 0.542 0.507
Residual Std. Error 380,586.300 (df = 117) 395,084.400 (df = 119)
F Statistic 21.997*** (df = 7; 117) 26.491*** (df = 5; 119)
Note: p<0.1; p<0.05; p<0.01
# can also open this HTML table in Word to customize it, but can also customize thru stargazer directly (might be a pain)

Now let’s check out diagnostic plots to see if our assumptions (normality of residuals and homoscedasticity) are satisfied or if we’re really concerned:

plot(home_lm2)

Make some home price predictions with new data. We’re going to use that simplified model (home_lm2).

# Grid of hypothetical homes for prediction: 10 evenly spaced home sizes
# (0-5000 sq ft) for each of the three cities, all with "Regular" status.
# Fixes: spell out `length.out` (avoid partial argument matching) and make
# the recycling of the size sequence explicit with `times = 3` -- the
# original `rep(seq(...))` had no `times` argument (a no-op) and silently
# relied on data.frame() recycling 10 values across 30 rows.
new_df <- data.frame(
  city   = rep(c("San Luis Obispo", "Arroyo Grande", "Atascadero"), each = 10),
  sq_ft  = rep(seq(0, 5000, length.out = 10), times = 3),
  status = "Regular"  # single value recycled to all 30 rows
)

Use predict function to make new predictions using home_lm2:

predict_df <- predict(home_lm2, newdata = new_df)

Bind that together with the new_df:

full_df <- data.frame(new_df, predict_df)

Make a graph with the actual data, and what our model actually predicts:

# if pulling from two sources, start with empty ggplot() call and add the
# data layer by layer
ggplot() +
  # Observed sales, colored and shaped by city.
  geom_point(data = homes,
             aes(x = sq_ft, 
                 y = price, 
                 color = city, 
                 shape = city)) +  # ggplot2 aesthetic is 'shape' ('pch' is the base-graphics alias)
  # Model predictions from home_lm2, one line per city.
  geom_line(data = full_df,
            aes(x = sq_ft,
                y = predict_df,
                color = city)) +
  theme_light()

# Quick comparison: separate OLS fits by city using ggplot's built-in
# smoother. Quote the method name ("lm"), the documented form for
# geom_smooth().
ggplot(data = homes, aes(x = sq_ft, y = price)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~city)

Intro to the sf package

sf = simple features

Has sticky geometry! Maintains the spatial information in the dataframe. This means we can just work with attributes like a normal data frame for wrangling, viz, etc.

Get the California dams data:

# Read the CA dams data, clean names to snake_case, and drop rows missing
# coordinates or a completion year.
# Fixes: qualify clean_names() as janitor::clean_names() for consistency
# with the homes import; collapse the three sequential drop_na() calls
# into one (drop_na() removes rows with an NA in ANY listed column, which
# is equivalent here).
dams <- read_csv("ca_dams.csv") %>% 
  janitor::clean_names() %>% 
  drop_na(latitude, longitude, year_completed)
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   RECORDID = col_double(),
##   STATEID = col_logical(),
##   LONGITUDE = col_double(),
##   LATITUDE = col_double(),
##   DISTANCE = col_double(),
##   YEAR_COMPLETED = col_double(),
##   DAM_LENGTH = col_double(),
##   DAM_HEIGHT = col_double(),
##   STRUCTURAL_HEIGHT = col_double(),
##   HYDRAULIC_HEIGHT = col_double(),
##   NID_HEIGHT = col_double(),
##   MAX_DISCHARGE = col_double(),
##   MAX_STORAGE = col_double(),
##   NORMAL_STORAGE = col_double(),
##   NID_STORAGE = col_double(),
##   SURFACE_AREA = col_double(),
##   DRAINAGE_AREA = col_double(),
##   INSPECTION_FREQUENCY = col_double(),
##   SPILLWAY_WIDTH = col_double(),
##   VOLUME = col_double()
##   # ... with 6 more columns
## )
## See spec(...) for full column specifications.
## Warning: 109 parsing failures.
##  row       col           expected actual          file
## 1337 FED_OTHER 1/0/T/F/TRUE/FALSE     CE 'ca_dams.csv'
## 1338 FED_OTHER 1/0/T/F/TRUE/FALSE     CE 'ca_dams.csv'
## 1339 FED_OTHER 1/0/T/F/TRUE/FALSE     CE 'ca_dams.csv'
## 1340 FED_OTHER 1/0/T/F/TRUE/FALSE     CE 'ca_dams.csv'
## 1341 FED_OTHER 1/0/T/F/TRUE/FALSE     CE 'ca_dams.csv'
## .... ......... .................. ...... .............
## See problems(...) for more details.

Convert lat/long numbers to simple features data (sf)

# Convert lat/long columns to sf point geometry, declaring the coordinate
# reference system in the same call (EPSG:4326 = WGS84 lon/lat) instead of
# a separate st_crs<- assignment; st_as_sf() documents the `crs` argument
# for exactly this.
dams_sf <- st_as_sf(dams, coords = c("longitude", "latitude"), crs = 4326)

plot(dams_sf) # basic plot works OK for initial review
## Warning: plotting the first 9 out of 67 attributes; use max.plot = 67 to
## plot all
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf

Now let’s read in the shapefile data for the state of CA (outer boundary TIGER shapefile data).

# Read the CA state boundary shapefile (TIGER 2016) from the
# 'ca_state_border' directory; here::here() builds a project-relative path.
ca_border <- read_sf(here::here("ca_state_border"), layer = "CA_State_TIGER2016")
plot(ca_border) # if you look at the table for ca_border, it holds the entire geometry of the polygon in 'geometry'
## Warning: plotting the first 9 out of 14 attributes; use max.plot = 14 to
## plot all

Now, plot dams in California over the CA outline, in ggplot:

# Map the dams over the CA border polygon, sizing points by dam height.
# Fix: R color names contain no spaces -- "slate grey" and "dark red" are
# not in colors() and error at draw time; use "slategrey" and "darkred".
ggplot() +
  geom_sf(data = ca_border, fill = "slategrey") +
  geom_sf(data = dams_sf,
          aes(size = dam_height),
          alpha = 0.5,
          color = "darkred",
          show.legend = FALSE)

Let’s make an animated plot:

# Animated map (gganimate): dams appear in the year they were completed
# and persist on screen once shown.
ggplot() +
  geom_sf(data = ca_border) +
  geom_sf(data = dams_sf, size = 1, color = "gray50") +
  theme_void() +
  transition_time(year_completed) + # animation time; give it the variable that contains that info
  shadow_mark(alpha = 1) # by default shows up & disappears, this makes sure they stay